 /*	This program creates the working dataset for the Main Sample,
that is used to predict retirement with the retirement prediction model */

***** Output file name and project directories
local dataname "retirmodeldata_mainsample.dta"

local dir_raw    "~/Dropbox/Retirement gaming/raw"
local dir_clean  "~/Dropbox/Retirement gaming/clean"
local dir_do     "~/Dropbox/Retirement gaming/dataverse"
local dir_output "~/Dropbox/Retirement gaming/output/dataverse"

***** Load the administrative data
use "`dir_clean'/admindata.dta", clear

* Put the identifiers i, t and j first and sort the rows by them
sort i t j
order i t j


****** Keep industria & comercio
drop aportaci_min
* Contribution fund (caja de aportacion): 1=industria y comercio,
* 2=civil (public sector), 3=rural, 4=construction, 5=notarial, 6=banking,
* 48=domestic service. Only fund 1 stays in the sample.
keep if aportaci==1

*** Gender in sample
* Keep sexo==1 only (every other value, described as females, is dropped)
keep if sexo==1

*** Identify and drop people in early retirement regimes
* vf_min==103: people working while already retired
* vf_min==97 or 98: people reported with no service to the firm
drop if inlist(vf_min, 103, 97, 98)

****** RETIREMENT AGE
* Month of exit, converted from the daily date Fegr
generate tegr = mofd(Fegr)
format tegr %tm

* Month in which the person is reported as leaving due to retirement
generate retirnow = (t==tegr & causal_5==1)
tabulate retirnow

* Month of the first retirement event of each person (missing if none)
capture drop aux
generate aux = t if retirnow==1
bysort i: egen tretir = min(aux)
drop aux
format tretir %tm

* Months strictly after the first retirement; only the first retirement
* observation keeps retirnow==1
generate postretir = 0
replace postretir = 1 if t>tretir
replace retirnow = 0 if postretir==1
bysort i: egen retir_insample = max(retirnow)
tabulate retir_insample postretir

*** Drop people with employment after retirement
* A person is dropped entirely if any post-retirement month has positive,
* non-missing W
generate flag = (W>0 & !missing(W) & postretir==1)
bysort i: egen iflag = max(flag)
drop if iflag==1
drop iflag flag


*** AGE AND MONTH OF OBSERVATION CENTERED
* Birth month, converted from the daily date Fnac
format Fnac %td
generate birth_month = mofd(Fnac)
format birth_month %tm

* Month of the 50th birthday, used as the reference age
generate refbday_month = birth_month + (12*50)
format refbday_month %tm

* Months elapsed since the 50th-birthday month (negative before age 50)
generate agemonths_centered = t - refbday_month
* Age in months, uncentered (the 50 years are added back)
generate agemonths = agemonths_centered + 12*50

* Age in years (fractional); stored as age, replacing any existing variable of that name
generate ageyears = agemonths/12
capture drop age
rename ageyears age
* Age in completed (integer) years
generate agedisc = floor(age)

*** Cohorts in sample
generate cohort = yofd(Fnac)
* Keep people born from April 1941 up to (but not including) April 1971
keep if birth_month>=tm(1941m4) & birth_month<tm(1971m4)

* Flag salaried-pay observations (tipREM_min==1). Nothing is dropped here; the flag
* is only used below to choose among duplicate person-month records.
g sample_salary = tipREM_min==1 

*** Keep only main job and drop duplicates
* The steps below reduce the data to one record per person-month (i t).
* Each step re-tags the remaining duplicates, so their order matters.

* Step 1: among duplicates, prioritize jobs with salary pay over jobs with daily or
* commission pay: when any record of the person-month is salaried, drop the
* non-salaried ones.
duplicates tag i t, g(tag)
tab tag sample_salary
bys i t: egen maxsal=max(sample_salary) 
drop if tag>0 & maxsal==1 & sample_salary==0
drop tag maxsal
* Step 2: among the remaining duplicates, drop records whose W is below the
* person-month maximum. Records with missing W survive this step because missing
* is never < maxrem.
duplicates tag i t, g(tag)
bysort i t: egen maxrem=max(W)
drop if tag>0 & W<maxrem
drop tag
drop maxrem
* Step 3: collapse records that share the same i, t and W into a single record
* (force: the records may differ in other variables).
duplicates tag i t, g(tag)
tab tag
sort i t
duplicates drop i t W, force
drop tag  

* Step 4: among any duplicates still left, drop the records with zero or missing W.
duplicates tag i t, g(tag)
drop if tag>0 & (W==0 | W==.)
drop tag

****************************************
** REFERENCE PERIOD
****************************************
* Reference period: observations at ages 45-49, or, for people never observed
* at those ages, the observations at the first age observed from 50 on
generate byte markrefp = inrange(agedisc, 45, 49)
bysort i: egen byte anyobs4549 = max(markrefp) // 1 if the person is observed at 45-49
sort i t
bysort i: generate order = _n

* Youngest integer age at which the person is observed at 50 or older
capture drop aux
generate aux = agedisc if agedisc>=50
bysort i: egen age1stobs = min(aux)
drop aux
tabulate age1stobs anyobs4549, missing

* People without observations at 45-49 use their first observed age instead
replace markrefp = 1 if anyobs4549==0 & agedisc==age1stobs
bysort i: egen anyrefp = max(markrefp) // check that everyone has some obs in the ref period
summarize agedisc if markrefp==1


****************************************
** VARIABLES TO BE USED IN PREDICTION MODEL
****************************************

* Self-employed indicator
generate self_empl = (status_1==1)

* Employed indicator
generate empl = (status_3==1)

*** Earnings outliers: set W and Wben to missing so they do not enter average
*** earnings, but keep the rows so they still count for the density of contribution
// Note: use the same outlier thresholds used in our main dataset
summarize W Wben, detail
generate aux = Wben // copy of Wben taken before trimming; not referenced again in this file
* Wben is trimmed first, while W still holds its original values
replace Wben = . if W<3.6046
replace Wben = . if W>81.3101 & W!=.
replace Wben = . if Wben>85.0847
replace W = . if W<3.6046
replace W = . if W>81.3101


* Mark observations at ages 45-57
generate agerange = inrange(agedisc, 45, 57)

* Count months employed and self-employed at 45-57
bysort i: egen count_empl = total(empl*agerange)
bysort i: egen count_self_empl = total(self_empl*agerange)

* Salaried-pay indicator
generate salary = (tipREM==1)

* Firm size classes based on ndep
bysort self_empl: summarize ndep
generate noempl = (ndep==0)
generate micro1 = (ndep>0 & ndep<5)
generate micro2 = (ndep>=5 & ndep<10)
* NOTE(review): a missing ndep satisfies ndep>=10 in Stata, so such rows get
* larger==1; confirm ndep has no missing values
generate larger = (ndep>=10)

* Firm sector
// 2-digit CIIU: first two characters of the code written out as a string
// NOTE(review): assumes ciiu carries no leading zero that is lost when stored as a number; confirm
capture drop aux
tostring ciiu, generate(aux)
generate ciiu2 = substr(aux, 1, 2)
destring ciiu2, replace
drop aux

* Broad sector groups built from the 2-digit code
generate ciiu1 = 1 if ciiu2<10                                                    // Agriculture and mining
replace  ciiu1 = 2 if inrange(ciiu2, 10, 33) | ciiu2==95                          // Manufacturing
replace  ciiu1 = 3 if inrange(ciiu2, 35, 39)                                      // Energy and waste disposal
replace  ciiu1 = 4 if inrange(ciiu2, 41, 43)                                      // Construction
replace  ciiu1 = 5 if inrange(ciiu2, 45, 47) | inrange(ciiu2, 55, 56)             // Wholesale and retail, restaurants, hotels
replace  ciiu1 = 6 if inrange(ciiu2, 49, 53) | ciiu2==61                          // Transport and communications
replace  ciiu1 = 7 if inrange(ciiu2, 62, 82) | ciiu2==96                          // Services
replace  ciiu1 = 8 if inrange(ciiu2, 84, 94) | inrange(ciiu2, 58, 60) | ciiu2==97 // Public admin, social and domestic services

* Sector dummies; group 1 (agriculture and mining) gets no dummy of its own
generate manufacturing     = (ciiu1==2)
generate retailhospitality = (ciiu1==5)
generate transportenergy   = inlist(ciiu1, 3, 6)
generate services          = inlist(ciiu1, 4, 7, 8)


*JOB CHANGES
xtset i t
sort i t
capture drop auxj
* Job identifier in the previous calendar month
bysort i: generate auxj = L.j
sort i t
* When the previous month is not in the data, fall back to the previous row of the same person
replace auxj = j[_n-1] if auxj==. & i==i[_n-1]
* 1 if the job differs from the previous one; missing when there is no previous job
generate jobchange = (auxj!=j) if auxj!=.


** Proportion of months of service registered since start of data **
sort i t
* Running count of months registered since the start of the database (April 1996)
bysort i: generate served_mthscum = _n
* Months elapsed since the start of the data, current month included
generate max_mthscum = t - tm(1996m4) + 1
* Running ratio: share of the months since April 1996 in which the person is registered
generate propm_served = served_mthscum/max_mthscum

** Proportion of months of service registered since age 18 until start of data **
generate tjobstart = mofd(Fing)
* Tenure in months as of April 1996, for people with a record in that month; 0 otherwise
capture drop aux
generate aux = (t - tjobstart) if t==tm(1996m4)
replace aux = 0 if aux==.
bysort i: egen served_mthspredata = max(aux) // reported months of tenure before 1996m4

* Months between the 18th (30th) birthday and the start of the data
generate max_mthsstart   = tm(1996m4) - (birth_month + 12*18)
generate max_mthsstart30 = tm(1996m4) - (birth_month + 12*30)
* Flag people whose 30th birthday falls on or after April 1996, then floor the window at 0
generate neg0_mthsstart30 = (max_mthsstart30<=0)
replace max_mthsstart30 = 0 if max_mthsstart30<0

* Constant ratios: share of the pre-data window covered by reported tenure
generate propm_predata   = served_mthspredata/max_mthsstart
generate propm_predata30 = served_mthspredata/max_mthsstart30
* A zero-length window is set to 0 before capping; both ratios are capped at 1
replace propm_predata30 = 0 if max_mthsstart30==0
replace propm_predata30 = 1 if propm_predata30>1
replace propm_predata = 1 if propm_predata>1


** Proportion of months of service in the reference period **
* Month of the 45th birthday, moved up to April 1996 when it falls before the data start
generate rfp_minmth = birth_month + 45*12
format rfp_minmth %tm
summarize rfp_minmth, format
replace rfp_minmth = tm(1996m4) if rfp_minmth<tm(1996m4)

* Month of the 50th birthday, with the same April 1996 floor
generate rfp_maxmth = birth_month + 50*12
format rfp_maxmth %tm
summarize rfp_maxmth, format
replace rfp_maxmth = tm(1996m4) if rfp_maxmth<tm(1996m4)

* Number of months the person can be observed between ages 45 and 50
generate max_mthsrefp = rfp_maxmth - rfp_minmth
* People who could have been observed at 45-49 but are not
generate miss4549 = (anyobs4549==0 & max_mthsrefp>0)
* People first observed from age 50 on get a 12-month window
* (this includes some who could have been observed at 45-49 but are not)
replace max_mthsrefp = 12 if anyobs4549==0

* Months observed in the reference period and their share of the window
bysort i: egen obs_mthsrefp = total(markrefp)
generate propm_refp = obs_mthsrefp/max_mthsrefp

* Store the person-month data for the merges that follow
tempfile data1
save `data1', replace

*******************************
** REPORTS MCB (SELF-EMPL) **
*******************************
* Attach the monthly ficto series from the raw CSV to the person-month data.
*Add fictos
* Build the list of distinct months (t) present in the person-month data
tempfile time
use `data1'
keep t 
duplicates drop 
save `time', replace
* Read the CSV from row 5 on, taking variable names from row 5
import delimited "`dir_raw'/FICTO UNIPERSONALES.csv", clear rowrange(5) varnames(5)
* Convert the amounts to numeric, ignoring the commas in the text
foreach var in valorbfc mgravado aportebps{
	destring `var', replace ignore(",")
}	
* Month of validity: fvigencia parsed as month-year, with 2019 as the top year for 2-digit years
g t=mofd(date(fvigencia, "MY",2019))
format t %tm
sort t
* Add the months that appear in the data but not in the CSV
merge 1:1 t using `time'
sort t
tset t
* Fill missing amounts with the value of the previous month
foreach var in valorbfc mgravado aportebps{
	replace `var' = l.`var' if `var'==.
}	
sort t
* Overwrite the second row with the values of the first row.
* NOTE(review): this applies unconditionally to row 2; the reason is not visible here, confirm against the CSV.
foreach var in valorbfc mgravado aportebps{
	replace `var' = `var'[1] if _n==2
}	
* Drop CSV months that do not appear in the data, then merge the series onto every person-month
drop if _m==1
drop _m fvigencia
merge 1:m t using `data1'
drop _m

* MCB price adjusted
g rficto = mgravado/(1000*ipc)
label var rficto "Min contribution base in 1000 Pesos of 2015"

* Reports MCB
* W and rficto are compared after rounding both to 2 decimals.
g Wround=round(W,.01)
g rfictoround=round(rficto,.01)
sum Wround W rfictoround rficto

* Stata orders missing values above every number, so an unguarded Wround>rfictoround
* is true whenever Wround is missing (W is set to missing for earnings outliers
* earlier in this file), and Wround==rfictoround is true when both are missing.
* Both indicators therefore require non-missing values on both sides; rows with
* missing earnings or missing MCB are coded 0.
* Self-employed reporting exactly the MCB
g reports_ficto= (Wround==rfictoround & !missing(Wround, rfictoround)) if self_empl==1
sum reports_ficto if self_empl==1
replace reports_ficto=0 if self_empl!=1

* Self-employed reporting strictly above the MCB
g reports_overficto= (Wround>rfictoround & !missing(Wround, rfictoround)) if self_empl==1
replace reports_overficto=0 if self_empl!=1

** COLLAPSE DATA TO INDIVIDUAL LEVEL (WITHIN REF PERIOD) **

* Keep the reference-period observations: ages 45-49, or the first age observed
* from 50 on for people never observed at 45-49
keep if markrefp==1

* The (first) and (last) statistics below take the first/last row of each person
* in the current row order. The data come out of a merge on t, so sort them
* chronologically within person to make those statistics well defined
* (propm_served in particular is a running ratio that changes every month).
sort i t

* One row per person:
*   (mean)  shares of reference-period months by status, firm size and sector; average W and Wben
*   (first) firm size and sector in the first reference-period month
*   (max)   ever-observed indicators
*   (last)  values as of the last reference-period month
collapse (mean) propm_self_empl=self_empl propm_empl=empl W Wben mficto=reports_ficto moverficto=reports_overficto ///
				mean_noempl=noempl mean_micro1=micro1 mean_micro2=micro2 mean_larger=larger mean_manuf=manufacturing mean_retail=retailhospitality mean_transport=transportenergy mean_services=services  ///
		(first) first_noempl=noempl first_micro1=micro1 first_micro2=micro2 first_larger=larger first_manuf=manufacturing first_retail=retailhospitality first_transport=transportenergy first_services=services  ///
		(max) salary self_empl empl anyobs4549 ever_ficto=reports_ficto ever_overficto=reports_overficto jobchange ///
		(last) propm_served propm_predata max_mthsstart cohort propm_refp ///
		, by(i)


* Save the person-level dataset used by the retirement prediction model
save "`dir_clean'/`dataname'", replace


clear all
exit







